{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyOS2QEuJ1BDI/3IFsLsFIZo",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/mlabonne/llm-course/blob/main/4_bit_LLM_Quantization_with_GPTQ.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 4-bit LLM Quantization with GPTQ\n",
        "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n",
        "\n",
        "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n",
        "\n",
        "Companion notebook to execute the code from the following article: https://mlabonne.github.io/blog/4bit_quantization/"
      ],
      "metadata": {
        "id": "yezrHxYvg_wR"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BhufqqQAaz6e"
      },
      "outputs": [],
      "source": [
        "!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "\n",
        "from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig\n",
        "from datasets import load_dataset\n",
        "import torch\n",
        "from transformers import AutoTokenizer\n",
        "\n",
        "\n",
        "# Define base model and output directory\n",
        "model_id = \"gpt2\"\n",
        "out_dir = model_id + \"-GPTQ\""
      ],
      "metadata": {
        "id": "dg8NyBL0ZNyw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load quantize config, model and tokenizer\n",
        "quantize_config = BaseQuantizeConfig(\n",
        "    bits=4,\n",
        "    group_size=128,\n",
        "    damp_percent=0.01,\n",
        "    desc_act=False,\n",
        ")\n",
        "model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_id)"
      ],
      "metadata": {
        "id": "C9352jN0ZP6I"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load data and tokenize examples\n",
        "n_samples = 1024\n",
        "data = load_dataset(\"allenai/c4\", data_files=\"en/c4-train.00001-of-01024.json.gz\", split=f\"train[:{n_samples*5}]\")\n",
        "tokenized_data = tokenizer(\"\\n\\n\".join(data['text']), return_tensors='pt')\n",
        "\n",
        "# Format tokenized examples\n",
        "examples_ids = []\n",
        "for _ in range(n_samples):\n",
        "    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)\n",
        "    j = i + tokenizer.model_max_length\n",
        "    input_ids = tokenized_data.input_ids[:, i:j]\n",
        "    attention_mask = torch.ones_like(input_ids)\n",
        "    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})"
      ],
      "metadata": {
        "id": "6wuBLe6aZSe-",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e4ebd71a-2854-4347-cebe-08cf040d1eb6"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:datasets.builder:Found cached dataset json (/root/.cache/huggingface/datasets/allenai___json/allenai--c4-6e494e9c0ee1404e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n",
            "Token indices sequence length is longer than the specified maximum sequence length for this model (2441065 > 1024). Running this sequence through the model will result in indexing errors\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "\n",
        "# Quantize with GPTQ\n",
        "model.quantize(\n",
        "    examples_ids,\n",
        "    batch_size=1,\n",
        "    use_triton=True,\n",
        ")\n",
        "\n",
        "# Save model and tokenizer\n",
        "model.save_quantized(out_dir, use_safetensors=True)\n",
        "tokenizer.save_pretrained(out_dir)"
      ],
      "metadata": {
        "id": "ETsG2iYrXaUg",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e48b825e-0ebc-4a73-dbfd-b5571cafd24e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "CPU times: user 4min 35s, sys: 3.49 s, total: 4min 39s\n",
            "Wall time: 5min 8s\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('gpt2-GPTQ/tokenizer_config.json',\n",
              " 'gpt2-GPTQ/special_tokens_map.json',\n",
              " 'gpt2-GPTQ/vocab.json',\n",
              " 'gpt2-GPTQ/merges.txt',\n",
              " 'gpt2-GPTQ/added_tokens.json',\n",
              " 'gpt2-GPTQ/tokenizer.json')"
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
        "\n",
        "# Reload model and tokenizer\n",
        "model = AutoGPTQForCausalLM.from_quantized(\n",
        "    out_dir,\n",
        "    device=device,\n",
        "    use_triton=True,\n",
        "    use_safetensors=True,\n",
        ")\n",
        "tokenizer = AutoTokenizer.from_pretrained(out_dir)"
      ],
      "metadata": {
        "id": "nktu1FsdZ9sd",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "9943c829-1b58-474a-f245-6aefa09d81dc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:accelerate.utils.modeling:The safetensors archive passed at gpt2-GPTQ/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.\n",
            "WARNING:auto_gptq.modeling._base:GPT2GPTQForCausalLM hasn't fused attention module yet, will skip inject fused attention.\n",
            "WARNING:auto_gptq.modeling._base:GPT2GPTQForCausalLM hasn't fused mlp module yet, will skip inject fused mlp.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import pipeline\n",
        "\n",
        "generator = pipeline('text-generation', model=model, tokenizer=tokenizer)\n",
        "result = generator(\"I have a dream\", do_sample=True, max_length=50)[0]['generated_text']\n",
        "print(result)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cRhIGrXdiFdt",
        "outputId": "6dca2078-6f01-44da-9895-3a03bdfb4b5b"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "The model 'GPT2GPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM'].\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "I have a dream,\" she told CNN last week. \"I have this dream of helping my mother find her own. But, to tell that for the first time, now that I'm seeing my mother now, just knowing how wonderful it is that\n"
          ]
        }
      ]
    }
  ]
}